In [2]:
import data_utils
import pandas as pd
import numpy as np
from data_utils import *
import os
In [100]:
#Load data 
file_path = 'bq-results1.csv'

df = load_fraud_data(file_path)

CLUSTERED BARCHART VISUALISATION OF TRANSACTION BY FRAUD CLASS

In [5]:
import plotly.express as px
import pandas as pd

fraud_list = df[df['isFraud'] == 1]
non_fraud_list = df[df['isFraud'] == 0]
fraud_x = fraud_list['type'].value_counts().index.tolist()
fraud_y = fraud_list['type'].value_counts().values.tolist()
nonfraud_x = non_fraud_list['type'].value_counts().index.tolist()
nonfraud_y = non_fraud_list['type'].value_counts().values.tolist()

fraud_data = pd.DataFrame({'Transaction_Type': fraud_x, 'Count': fraud_y, 'Name': 'Fraud'})
non_fraud_data = pd.DataFrame({'Transaction_Type': nonfraud_x, 'Count': nonfraud_y, 'Name': 'Non-Fraud'})

combined_data = pd.concat([fraud_data, non_fraud_data], ignore_index=True)

fig = px.bar(combined_data, x='Transaction_Type', y='Count', color='Name', color_discrete_map={'Fraud': 'red', 'Non-Fraud': 'grey'})

fig.update_layout(
    title='Clustered Bar Chart of Transaction Types',
    xaxis_title='Transaction Type',
    yaxis_title='Log Count',
    barmode='group',  
    height=500,
    width=800,
)
# Apply logarithmic scale to the y-axis
fig.update_yaxes(type="log", exponentformat='none')

fig.show()

MULTIVARIATE ANALYSIS

1.CORRELATION MATRIX

In [179]:
# Correlation Matrix
correlation_matrix = df.corr(numeric_only=True)
print('CORRELATION')
correlation_matrix
CORRELATION
Out[179]:
step amount oldbalanceOrg newbalanceOrig oldbalanceDest newbalanceDest isFraud isFlaggedFraud
step 1.000000 0.022373 -0.010058 -0.010299 0.027665 0.025888 0.031578 0.003277
amount 0.022373 1.000000 -0.002762 -0.007861 0.294137 0.459304 0.076688 0.012295
oldbalanceOrg -0.010058 -0.002762 1.000000 0.998803 0.066243 0.042029 0.010154 0.003835
newbalanceOrig -0.010299 -0.007861 0.998803 1.000000 0.067812 0.041837 -0.008148 0.003776
oldbalanceDest 0.027665 0.294137 0.066243 0.067812 1.000000 0.976569 -0.005885 -0.000513
newbalanceDest 0.025888 0.459304 0.042029 0.041837 0.976569 1.000000 0.000535 -0.000529
isFraud 0.031578 0.076688 0.010154 -0.008148 -0.005885 0.000535 1.000000 0.044109
isFlaggedFraud 0.003277 0.012295 0.003835 0.003776 -0.000513 -0.000529 0.044109 1.000000
  1. HEAT MAP VISULAISATION OF CORRELATION MATRIX
In [7]:
fig3 = go.Figure(data=[go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    colorscale='Cividis',  # Updated colorscale
    zmin=-1,
    zmax=1,
    hoverongaps=False
)])

# Update layout for a professional, publication-quality plot
fig3 = go.Figure(data=[go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    colorscale='Cividis',
    zmin=-1,
    zmax=1,
    hoverongaps=False
)])

# Add text annotations for label inside each box
for i, row in enumerate(correlation_matrix.index):
    for j, col in enumerate(correlation_matrix.columns):
        fig3.add_annotation(
            x=col, 
            y=row, 
            text=f"{correlation_matrix.loc[row, col]:.4f}", 
            font=dict(size=12),
            showarrow=False,
            xref="x",
            yref="y"
        )

# Update layout for a professional, publication-quality plot
fig3.update_layout(
    title='Correlation Heatmap',
    xaxis=dict(tickmode='linear'),
    yaxis=dict(tickmode='linear'),
    font=dict(
        family="Arial, monospace",
        size=14,
        color="#000000"
    ),
    margin=dict(l=100, r=100, t=100, b=100),
    paper_bgcolor="white",
    autosize=False,
    width=1100,
    height=700
)

# Show the figure
fig3.show()

DATA TRANSFORMATION¶

1.FILTERING OF TRANSACTION TYPES WITH BOTH FRAUD CLASSES

In [102]:
filtered_df = filter_fraud_data(df)
filtered_df.head()
Out[102]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
4 162 CASH_OUT 219742.61 C493470768 25575.0 0.00 C861627372 323624.20 543366.81 0 0
5 133 CASH_OUT 219742.55 C860234025 0.0 0.00 C221750228 1365483.09 1569242.65 0 0
8 279 TRANSFER 219742.27 C1763304788 0.0 0.00 C290822335 759457.06 979199.33 0 0
9 257 CASH_OUT 219742.26 C1788227639 29035.0 0.00 C1568862543 686283.62 906025.88 0 0
10 352 CASH_OUT 219742.19 C1880833503 904462.0 684719.81 C1915634371 748163.19 967905.38 0 0
  1. PIE CHART VISULAISATION OF FRAUD DISTRIBTUION BY TRANSACTION TYPES
In [10]:
plot_pie_chart(df=filtered_df, column_name='type', title='Distribution of Fraud Transaction Types')
  1. BINARY ENCODING
In [106]:
transformed_df = transform_transaction_type(df=filtered_df, column_name='type')
transformed_df.head()
Out[106]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
4 162 1 219742.61 C493470768 25575.0 0.00 C861627372 323624.20 543366.81 0 0
5 133 1 219742.55 C860234025 0.0 0.00 C221750228 1365483.09 1569242.65 0 0
8 279 1 219742.27 C1763304788 0.0 0.00 C290822335 759457.06 979199.33 0 0
9 257 1 219742.26 C1788227639 29035.0 0.00 C1568862543 686283.62 906025.88 0 0
10 352 1 219742.19 C1880833503 904462.0 684719.81 C1915634371 748163.19 967905.38 0 0
  1. CATEGORICAL ENCODING
In [107]:
transformed_df = data_encoding(df=transformed_df, column1='nameOrig', column2='nameDest')
In [108]:
transformed_df.head()
Out[108]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
4 162 1 219742.61 2041924 25575.0 0.00 473203 323624.20 543366.81 0 0
5 133 1 219742.55 2568121 0.0 0.00 304416 1365483.09 1569242.65 0 0
8 279 1 219742.27 1091742 0.0 0.00 322753 759457.06 979199.33 0 0
9 257 1 219742.26 1127672 29035.0 0.00 149889 686283.62 906025.88 0 0
10 352 1 219742.19 1260399 904462.0 684719.81 241197 748163.19 967905.38 0 0

FEATURE SELECTION¶

  1. ANOVA F-Test
In [109]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
features = transformed_df[['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 
                'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest',
                'isFlaggedFraud']]

target = transformed_df['isFraud']

# Assuming 'features' and 'target' are already defined in your environment
# Selecting best features using ANOVA F-test
best_features = SelectKBest(score_func=f_classif, k='all')
fit = best_features.fit(features, target)

# Creating DataFrame for feature scores
featureScores = pd.DataFrame(data=fit.scores_, index=list(features.columns), columns=['ANOVA Score'])
featureScores = featureScores.sort_values(ascending=False, by='ANOVA Score')

# Creating subplots for heatmap visualization
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(6, 5))

# Define a color palette
colors = "YlGnBu"

# First subplot for the first 5 features
plt.subplot(1, 2, 1)
sns.heatmap(featureScores.iloc[:6, :], annot=True, cmap=colors, linewidths=0.4, linecolor='black', cbar=False, fmt='.2f')
plt.title('ANOVA Score : Part 1')

# Second subplot for the next 5 features
plt.subplot(1, 2, 2)
sns.heatmap(featureScores.iloc[6:13, :], annot=True, cmap=colors, linewidths=0.4, linecolor='black', cbar=False, fmt='.2f')
plt.title('ANOVA Score : Part 2')

# Adjust layout
fig.tight_layout(w_pad=2)

# Display the plot
plt.show()
C:\Users\Administrator\AppData\Roaming\Python\Python311\site-packages\sklearn\feature_selection\_univariate_selection.py:112: UserWarning:

Features [1] are constant.

C:\Users\Administrator\AppData\Roaming\Python\Python311\site-packages\sklearn\feature_selection\_univariate_selection.py:113: RuntimeWarning:

invalid value encountered in divide

In [110]:
featureScores = featureScores.sort_values(by='ANOVA Score', ascending=False)
featureScores
Out[110]:
ANOVA Score
oldbalanceOrg 380694.276437
amount 13901.637857
newbalanceOrig 11236.549146
step 6578.263516
isFlaggedFraud 5391.619385
oldbalanceDest 620.175363
newbalanceDest 223.308306
nameOrig 1.505560
nameDest 1.020476
type NaN
In [111]:
# Example usage:
df_fraud = transformed_df
threshold = 0.7
correlated_features = find_highly_correlated_features(df_fraud, threshold)

# Print out the highly correlated feature pairs
for feature, correlations in correlated_features.items():
    print(f"{feature} is highly correlated with {correlations}")
oldbalanceOrg is highly correlated with ['newbalanceOrig']
newbalanceOrig is highly correlated with ['oldbalanceOrg']
oldbalanceDest is highly correlated with ['newbalanceDest']
newbalanceDest is highly correlated with ['oldbalanceDest']
  1. Mutual Information
In [55]:
sorted_feature_scores_df, fig = feature_selection_and_plot(df=df_fraud , target_col='isFraud')
In [81]:
fig.show()
sorted_feature_scores_df
Out[81]:
Feature Score
0 type 0.254569
1 step 0.012048
2 oldbalanceOrg 0.008912
3 newbalanceDest 0.005851
4 amount 0.003426
5 oldbalanceDest 0.002059
6 nameDest 0.000797
7 newbalanceOrig 0.000431
8 isFlaggedFraud 0.000075
9 nameOrig 0.000000

FEATURE ENGINEERING¶

1 New TransactionFeature Formation: 'bal_chg', 'orig_zero', 'amt_bal_ratio', 'chg_amt_ratio'

In [112]:
transformed_df  = generate_transaction_features( transformed_df)
transformed_df.head()
Out[112]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud bal_chg orig_zero amt_bal_ratio chg_amt_ratio
4 162 1 219742.61 2041924 25575.0 0.00 473203 323624.20 543366.81 0 0 -25575.00 0 8.592086 -0.116386
5 133 1 219742.55 2568121 0.0 0.00 304416 1365483.09 1569242.65 0 0 0.00 1 0.000000 0.000000
8 279 1 219742.27 1091742 0.0 0.00 322753 759457.06 979199.33 0 0 0.00 1 0.000000 0.000000
9 257 1 219742.26 1127672 29035.0 0.00 149889 686283.62 906025.88 0 0 -29035.00 0 7.568185 -0.132132
10 352 1 219742.19 1260399 904462.0 684719.81 241197 748163.19 967905.38 0 0 -219742.19 0 0.242953 -1.000000
In [113]:
# drop highly correlated columns: drop newbalanceOrig
col_to_drop = ['newbalanceOrig','nameDest', 'oldbalanceDest', 'newbalanceDest','isFlaggedFraud','nameOrig']

drop_columns(df=transformed_df, columns_to_drop=col_to_drop)
transformed_df.head()
Out[113]:
step type amount oldbalanceOrg isFraud bal_chg orig_zero amt_bal_ratio chg_amt_ratio
4 162 1 219742.61 25575.0 0 -25575.00 0 8.592086 -0.116386
5 133 1 219742.55 0.0 0 0.00 1 0.000000 0.000000
8 279 1 219742.27 0.0 0 0.00 1 0.000000 0.000000
9 257 1 219742.26 29035.0 0 -29035.00 0 7.568185 -0.132132
10 352 1 219742.19 904462.0 0 -219742.19 0 0.242953 -1.000000
In [122]:
df_selected =  transformed_df.dropna()
df_selected.head()
Out[122]:
step type amount oldbalanceOrg isFraud bal_chg orig_zero amt_bal_ratio chg_amt_ratio
4 162 1 219742.61 25575.0 0 -25575.00 0 8.592086 -0.116386
5 133 1 219742.55 0.0 0 0.00 1 0.000000 0.000000
8 279 1 219742.27 0.0 0 0.00 1 0.000000 0.000000
9 257 1 219742.26 29035.0 0 -29035.00 0 7.568185 -0.132132
10 352 1 219742.19 904462.0 0 -219742.19 0 0.242953 -1.000000

2 Descriptive Statistics and Heatmaps for Fraud and Non-Fraud Samples

In [126]:
plot_descriptive_stats_heatmaps(df_selected, 'isFraud')
In [238]:
fraud_stats = df_selected[df_selected['isFraud'] == 1].describe().T
nofraud_stats = df_selected[df_selected['isFraud'] == 0].describe().T

fraud_mean = fraud_stats['mean']
fraud_mean = fraud_mean.drop('isFraud', errors='ignore')

nofraud_mean = nofraud_stats['mean']
nofraud_mean = nofraud_mean.drop('isFraud', errors='ignore')

fraud_class_observation = fraud_mean - nofraud_mean
In [243]:
import plotly.graph_objects as go
import pandas as pd

# Sample data - replace with your actual fraud_mean and nofraud_mean

df_viz = pd.DataFrame({'Feature': fraud_mean.index, 'Fraud Mean': fraud_mean.values, 'No Fraud Mean': nofraud_mean.values})

# Dot Plot with specific color for Fraud
fig_dot = go.Figure()
fig_dot.add_trace(go.Scatter(x=df_viz['Feature'], y=df_viz['Fraud Mean'], mode='markers', name='Fraud', marker_color='red'))
fig_dot.add_trace(go.Scatter(x=df_viz['Feature'], y=df_viz['No Fraud Mean'], mode='markers', name='No Fraud', marker_color='blue'))
fig_dot.update_layout(title="Scatter Plot of Fraud and No Fraud Means", xaxis_title="Feature", yaxis_title="Mean Value")
fig_dot.show()

# Creating a DataFrame for plotting
df_fraud = pd.DataFrame({'Feature': fraud_mean.index, 'Log Mean Value': np.log1p(fraud_mean.values), 'Class': 'Fraud'})
df_nofraud = pd.DataFrame({'Feature': nofraud_mean.index, 'Log Mean Value': np.log1p(nofraud_mean.values), 'Class': 'No Fraud'})
df_combined = pd.concat([df_fraud, df_nofraud])

# Creating the scatter plot with specific colors and log-transformed mean values
fig_scatter = px.scatter(df_combined, x='Feature', y='Log Mean Value', color='Class', 
                         color_discrete_map={'Fraud': 'red', 'No Fraud': 'blue'},
                         title='Scatter Plot of Log-Transformed Fraud and No Fraud Means',
                         opacity=0.5)
fig_scatter.show()
C:\Users\Administrator\AppData\Local\Temp\ipykernel_3084\2164487125.py:16: RuntimeWarning:

invalid value encountered in log1p

C:\Users\Administrator\AppData\Local\Temp\ipykernel_3084\2164487125.py:17: RuntimeWarning:

invalid value encountered in log1p

In [242]:
print(f'\nFraud Class Mean')
print((fraud_stats['mean']).round(2))
print(f'\nNo Fraud Class Mean\n')
print((nofraud_stats['mean']).round(2))

print(f'\nMean Difference')
print(fraud_class_observation)
Fraud Class Mean
step                 368.08
type                   1.00
amount           1470832.67
oldbalanceOrg    1652887.65
isFraud                1.00
bal_chg         -1460119.48
orig_zero              0.00
amt_bal_ratio          1.00
chg_amt_ratio         -0.99
Name: mean, dtype: float64

No Fraud Class Mean

step                241.63
type                  1.00
amount           314115.50
oldbalanceOrg     42879.69
isFraud               0.00
bal_chg          -27311.99
orig_zero             0.47
amt_bal_ratio       145.97
chg_amt_ratio        -0.19
Name: mean, dtype: float64

Mean Difference
step             1.264515e+02
type             0.000000e+00
amount           1.156717e+06
oldbalanceOrg    1.610008e+06
bal_chg         -1.432807e+06
orig_zero       -4.706822e-01
amt_bal_ratio   -1.449771e+02
chg_amt_ratio   -8.078642e-01
Name: mean, dtype: float64
In [ ]:
df_selected.to_csv('fraud_data.csv', index=False)